In [1]:
import numpy as np
import numpy.random as npr
import pylab as pl

In [2]:
%matplotlib inline

Overview of algorithm:

  • Compute clusters
  • Connect clusters

Computing clusters: they use an estimate of local density to down-sample the densest regions of the data, then cluster what remains. The steps (a minimal sketch of the whole pipeline follows this list):

  1. Estimate local density:
    • In the paper, they estimate density by counting how many points lie within a threshold L1-distance. This is a little odd:
      • Why L1 distance / why not L2 (Euclidean)?
    • Alternatives:
      • Count the population within a threshold radius, but w.r.t. a different $p$-norm, e.g. $p=2$
      • Average L1/L2 distance to the $k$-nearest neighbors
      • Kernel density estimator: use cross-validation to select the bandwidth
  2. Down-sample: keep each point with a probability that decreases with its local density, so dense regions are thinned out
  3. Cluster: run agglomerative clustering on the down-sampled data
  4. Up-sample: assign every original point to the cluster of its nearest down-sampled neighbor
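
A minimal sketch of the whole pipeline, assuming scikit-learn and networkx (the same libraries used in the cells below); the inverse-density acceptance rule and the 1e-12 smoothing constant are simplifications of ours, not taken from the paper:

In [ ]:
import numpy as np
import networkx as nx
from sklearn.neighbors import kneighbors_graph, KNeighborsClassifier
from sklearn.cluster import AgglomerativeClustering

def spade_sketch(X, num_clusters=50, k=20):
    # 1. local density ~ inverse of the mean distance to the k nearest neighbors
    dist_sum = np.asarray(kneighbors_graph(X, k, mode='distance').sum(1)).ravel()
    density = 1.0 / (dist_sum / k + 1e-12)
    # 2. down-sample: accept each point with probability inversely
    #    proportional to its local density, thinning the dense regions
    accept_prob = density.min() / density
    keep = np.random.rand(len(X)) < accept_prob
    X_down = X[keep]
    # 3. cluster the down-sampled data
    labels_down = AgglomerativeClustering(n_clusters=num_clusters).fit_predict(X_down)
    # 4. up-sample: every original point inherits the cluster label of its
    #    nearest down-sampled neighbor
    labels = KNeighborsClassifier(1).fit(X_down, labels_down).predict(X)
    # 5. connect clusters: MST over pairwise distances between cluster means
    centers = np.array([X_down[labels_down == i].mean(0) for i in range(num_clusters)])
    G = nx.Graph()
    for i in range(num_clusters):
        for j in range(i + 1, num_clusters):
            G.add_edge(i, j, weight=np.linalg.norm(centers[i] - centers[j]))
    return labels, centers, nx.minimum_spanning_tree(G)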

In [3]:
from sklearn import neighbors
from sklearn.neighbors import KernelDensity

In [9]:
def generate_blobs(num_samples=5000,separation=8):
    # blob centers at the corners of a square (side length = separation),
    # plus one center at the origin
    centers = np.array([[0,0],[1,0],[0,1],[1,1]],dtype=float)
    centers -= 0.5
    centers = np.vstack((centers,[0,0]))
    centers *= separation
    # sample points from a KDE fit to the centers, and keep the true density
    # of each sample for later comparison
    kde = KernelDensity()
    kde.fit(centers)
    samples = kde.sample(num_samples)
    density = np.exp(kde.score_samples(samples))
    return samples,density

In [32]:
samples,density = generate_blobs(5000,10)

In [33]:
pl.scatter(samples[:,0],samples[:,1],
           c=density,linewidth=0)
pl.axis('off')


Out[33]:
(-15.0, 10.0, -10.0, 15.0)

In [61]:
def local_density_k(X,k=10,metric=None):
    # log(1 + mean distance to the k nearest neighbors); note this is *low*
    # in dense regions and high in sparse ones
    if metric is not None:
        bt = neighbors.BallTree(X,200,metric=metric)
        neighbor_graph = neighbors.kneighbors_graph(bt,k,'distance')
    else:
        neighbor_graph = neighbors.kneighbors_graph(X,k,'distance')
    distances = np.array(neighbor_graph.mean(1))[:,0]
    return np.log(distances+1)

In [52]:
def local_density_r(X,r=0.1,metric=None):
    # number of neighbors within radius r, min-max normalized to [0,1];
    # high values correspond to dense regions
    if metric is not None:
        bt = neighbors.BallTree(X,200,metric=metric)
        neighbor_graph = neighbors.radius_neighbors_graph(bt,r)
    else:
        neighbor_graph = neighbors.radius_neighbors_graph(X,r)
    counts = np.array(neighbor_graph.sum(1))[:,0]
    return (counts - counts.min())/(counts.max() - counts.min())

In [53]:
pl.scatter(samples[:,0],samples[:,1],
           c=local_density_r(samples,1.5,'l1'),linewidth=0)
pl.axis('off')


Out[53]:
(-15.0, 10.0, -10.0, 15.0)

In [54]:
pl.scatter(samples[:,0],samples[:,1],
           c=local_density_r(samples,1.5),linewidth=0)
pl.axis('off')


Out[54]:
(-15.0, 10.0, -10.0, 15.0)

In [59]:
pl.scatter(samples[:,0],samples[:,1],
           c=local_density_k(samples,10),linewidth=0)
pl.axis('off')


Out[59]:
(-15.0, 10.0, -10.0, 15.0)

In [56]:
pl.scatter(samples[:,0],samples[:,1],
           c=local_density_k(samples,5,'l1'),linewidth=0)
pl.axis('off')


Out[56]:
(-15.0, 10.0, -10.0, 15.0)

In [18]:
from sklearn.linear_model import LinearRegression

In [60]:
k = [1,5,10,50,100,200]
for i in range(6):
    pl.subplot(3,2,i+1)
    est_density = local_density_k(samples,k[i])
    pred_density = est_density
    #lr = LinearRegression()
    #X = np.reshape(est_density,(len(density),1))
    #lr.fit(X,density)
    #pred_density = lr.predict(X)
    pl.scatter(pred_density,density,
               #c=abs(density-pred_density),
               #linewidth=0,
               #cmap='spectral',
               alpha=0.5
               )
    pl.title('k={0}'.format(k[i]))
    pl.axis('off')



In [20]:
def plot_estimator_error(samples,density,estimator,params,
                         param_name='r'):
    for i in range(6):
        pl.subplot(3,2,i+1)
        est_density = estimator(samples,params[i])
        pred_density = est_density
        #lr = LinearRegression()
        #X = np.reshape(est_density,(len(density),1))
        #lr.fit(X,density)
        #pred_density = lr.predict(X)
        pl.scatter(pred_density,density,
                   c='blue',
                   #c=abs(density-pred_density),
                   #linewidth=0,
                   #cmap='spectral',
                   alpha=0.5,
                   )
        pl.title(param_name+'={0}'.format(params[i]))
        pl.axis('off')

r = [0.01,0.1,0.25,0.5,1.0,2.0]
plot_estimator_error(samples,density,local_density_r,r,'r')
pl.figure()
k = [1,5,10,50,100,200]
plot_estimator_error(samples,density,local_density_k,k,'k')



In [21]:
ld_r_l1 = lambda X,r : local_density_r(X,r,metric='l1')
ld_k_l1 = lambda X,k : local_density_k(X,k,metric='l1')

r = [0.01,0.1,0.25,1.0,2.0,10.0]
plot_estimator_error(samples,density,ld_r_l1,r,'r')
pl.figure()
k = [1,5,10,50,100,200]
plot_estimator_error(samples,density,ld_k_l1,k,'k')



In [24]:
# let's try a higher dimensional example
centers = npr.randn(100,10)*10
kde = KernelDensity()
kde.fit(centers)
samples = kde.sample(5000)
density = np.exp(kde.score_samples(samples))

In [25]:
k = [1,5,10,50,100,200]
plot_estimator_error(samples,density,local_density_k,k,'k')



In [26]:
k = [1,5,10,20,30,50]
plot_estimator_error(samples,density,ld_k_l1,k,'k')



In [27]:
r = [2.0,3.0,5.0,6.0,7.0,10.0]
plot_estimator_error(samples,density,local_density_r,r,'r')



In [28]:
r = [3.0,5.0,6.0,7.0,10.0,12.0]
plot_estimator_error(samples,density,ld_r_l1,r,'r')



In [29]:
from sklearn.grid_search import GridSearchCV

# this snippet from documentation: 
# http://scikit-learn.org/stable/auto_examples/neighbors/plot_digits_kde_sampling.html
params = {'bandwidth': np.logspace(-1, 1, 20)}
grid = GridSearchCV(KernelDensity(), params)
grid.fit(samples)

print("best bandwidth: {0}".format(grid.best_estimator_.bandwidth))

# use the best estimator to compute the kernel density estimate
kde = grid.best_estimator_


best bandwidth: 0.8858667904100825

In [25]:
pred_density = np.exp(kde.score_samples(samples))

In [26]:
pl.scatter(pred_density,density)
pl.xlabel('predicted density')
pl.ylabel('actual density')
#pl.axis('off')


Out[26]:
<matplotlib.text.Text at 0x10f993cd0>

In [27]:
pl.scatter(samples[:,0],samples[:,1],
           c=-pred_density**2,linewidth=0)


Out[27]:
<matplotlib.collections.PathCollection at 0x1136c1090>

In [29]:
knn = neighbors.KNeighborsClassifier(1,metric='l1')
n = 100000
knn.fit(npr.randn(n,2),range(n))


Out[29]:
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='l1',
           metric_params=None, n_neighbors=1, p=2, weights='uniform')

In [8]:
from sklearn.cluster import AgglomerativeClustering
ag = AgglomerativeClustering(10)
ag.fit(samples[:1000])


Out[8]:
AgglomerativeClustering(affinity='euclidean', compute_full_tree='auto',
            connectivity=None, linkage='ward',
            memory=Memory(cachedir=None), n_clusters=10, n_components=None,
            pooling_func=<function mean at 0x106b6f2a8>)

In [31]:
knn = neighbors.KNeighborsClassifier(1,metric='l1')
knn.fit(samples[:1000],ag.labels_)


Out[31]:
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='l1',
           metric_params=None, n_neighbors=1, p=2, weights='uniform')

In [32]:
knn.predict(samples)


Out[32]:
array([4, 3, 3, ..., 6, 2, 4])

In [33]:
pl.scatter(samples[:,0],samples[:,1],
           c=knn.predict(samples),
           linewidth=0,alpha=0.5)
pl.axis('off')


Out[33]:
(-10.0, 10.0, -10.0, 10.0)

In [34]:
pl.scatter(samples[:1000,0],samples[:1000,1],
           c=knn.predict(samples[:1000]),
           linewidth=0,alpha=0.5)
pl.axis('off')


Out[34]:
(-10.0, 10.0, -10.0, 10.0)

In [35]:
len(set(knn.predict(samples)))


Out[35]:
10

In [9]:
# compute distance graph

import networkx as nx

def distance(x,y):
    return np.sqrt(np.sum((x-y)**2))

def distance_graph(centers):
    G = nx.Graph()
    for i in range(len(centers)-1):
        for j in range(i+1,len(centers)):
            G.add_edge(i,j,weight=distance(centers[i],centers[j]))
    return G

In [32]:
def SPADE(X,
          num_clusters=50,
          density_estimator='k-nearest',
          # or 'kde' or 'eps-neighbors'
          k=20,
          r=1.0,
          num_plots=1,
          plot_overlay=False,
          native_layout=False,
          verbose=True):
    num_points = len(X)
    
    from time import time
    t0 = time()
    if verbose:
        print("Estimating density...")
        
    if density_estimator not in ['kde','k-nearest','eps-neighbors']:
        print('invalid density_estimator, defaulting to k-nearest')
        density_estimator = 'k-nearest'
    
    if density_estimator == 'k-nearest':
        scores = local_density_k(X,k)
    if density_estimator == 'eps-neighbors':
        scores = local_density_r(X,r)
    if density_estimator == 'kde':
        
        from sklearn.grid_search import GridSearchCV

        # this snippet from documentation: 
        # http://scikit-learn.org/stable/auto_examples/neighbors/plot_digits_kde_sampling.html
        params = {'bandwidth': np.logspace(-1, 1, 20)}
        grid = GridSearchCV(KernelDensity(), params)
        grid.fit(X)

        print("best bandwidth: {0}".format(grid.best_estimator_.bandwidth))

        # use the best estimator to compute the kernel density estimate
        kde = grid.best_estimator_
        # estimate density
        kde.fit(X)
        scores = kde.score_samples(X)
    
    if verbose:
        print("Done! Elapsed time: {0:.2f}s".format(time() - t0))
    
    # "down-sample"
    if verbose:
        print("Down-sampling...")
    # keep each point with probability inversely related to its density score
    # (note: local_density_k returns a distance-based score that is *low* in
    # dense regions, so its sense is inverted relative to the other estimators)
    accept_prob = 1 - (scores - scores.min()) / (scores.max() - scores.min())
    
    down_sampled_X = []
    down_sampled_i = []
    for i in range(num_points):
        if npr.rand() < accept_prob[i]:
            down_sampled_i.append(i)
            down_sampled_X.append(X[i])
    down_sampled_X = np.array(down_sampled_X)
    if verbose:
        print("Done! Reduced data from {1} to {2} points. Elapsed time: {0:.2f}s".format(time() - t0,len(X),len(down_sampled_X)))
    
        print("Clustering...")
    # cluster down-sampled data
    cluster_model = AgglomerativeClustering(num_clusters)
    cluster_model.fit(down_sampled_X)
    
    # compute cluster centers
    cluster_pred = cluster_model.labels_
    cluster_centers = []
    for i in range(num_clusters):
        cluster_centers.append(np.mean(down_sampled_X[cluster_pred==i],0))
    cluster_centers = np.array(cluster_centers)
    
    if verbose:
        print("Done! Elapsed time: {0:.2f}s".format(time() - t0))
    
        print("Up-sampling...")

    knn = neighbors.KNeighborsClassifier(1,metric='l1')
    knn.fit(down_sampled_X,cluster_pred)
    upsampled_cluster_pred = knn.predict(X)
    # "up-sample" (assign all points to nearest cluster center)
    #upsampled_cluster_pred = assign_to_nearest_landmark(X,cluster_centers)
    
    # compute normalized cluster occupancies
    occupancy = np.array([sum(upsampled_cluster_pred==i) for i in range(num_clusters)])
    norm_occupancy = (occupancy - occupancy.min()) / float(occupancy.max() - occupancy.min())
    
    if verbose:
        print("Done! Elapsed time: {0:.2f}s".format(time() - t0))
    
        print("Computing MST...")
    # compute MST
    G = distance_graph(cluster_centers)
    mst = nx.minimum_spanning_tree(G)
    if verbose:
        print("Done! Elapsed time: {0:.2f}s".format(time() - t0))

    
        print("Plotting SPADE tree...")
    # plot SPADE tree
    if X.shape[1] == 2 and plot_overlay:
        print("Plotting overlay on data")
        pl.scatter(X[:,0],X[:,1],c=upsampled_cluster_pred,linewidths=0,alpha=0.5)
        pl.scatter(cluster_centers[:,0],cluster_centers[:,1]);
        for e in mst.edges():
            cpts = cluster_centers[np.array(e)]
            pl.plot(cpts[:,0],cpts[:,1],c='black',linewidth=2)
        pl.axis('off')
        
        pl.figure()
    
    # set positions by force-directed layout
    pos = nx.graphviz_layout(mst)
    positions = np.zeros((len(pos),2))
    for p in pos:
        positions[p] = pos[p]
    
    # if the data is 2D, we can optionally use cluster centers
    if X.shape[1] == 2 and native_layout:
        positions = cluster_centers
    
    for e in mst.edges():
        cpts = positions[np.array(e)]
        pl.plot(cpts[:,0],cpts[:,1],c='black',linewidth=2)
    #pl.scatter(down_sampled_X[:,0],down_sampled_X[:,1],c=cluster_pred,linewidths=0,alpha=0.5)
    pl.scatter(positions[:,0],positions[:,1],
               c=cluster_centers[:,0],s=100+(200*norm_occupancy));

    pl.axis('off')
    
    print("Done! Total elapsed time: {0:.2f}s".format(time() - t0))
    
    return cluster_centers,mst

In [11]:
samples,density = generate_blobs(50000,10)

In [12]:
corners = [0,1]

In [13]:
import itertools
square = [i for i in itertools.product(corners,corners)]
square


Out[13]:
[(0, 0), (0, 1), (1, 0), (1, 1)]

In [59]:
cube = [i for i in itertools.product(corners,corners,corners)]
cube


Out[59]:
[(0, 0, 0),
 (0, 0, 1),
 (0, 1, 0),
 (0, 1, 1),
 (1, 0, 0),
 (1, 0, 1),
 (1, 1, 0),
 (1, 1, 1)]

In [14]:
def hypercube(ndim=3):
    corners = [-1,1]
    corner_list = [corners for _ in xrange(ndim)]
    return [i for i in itertools.product(*corner_list)]

In [15]:
h_corners = np.array(hypercube(5))*5
h_corners.shape


Out[15]:
(32, 5)

In [61]:
h_corners = np.array(hypercube(2))

In [62]:
branches = np.array(h_corners)
for i in range(20):
    branches = np.vstack((branches, h_corners*(i+2)))
branches = np.vstack((branches,np.zeros(branches.shape[1])))

In [65]:
import matplotlib.pyplot as pl
from mpl_toolkits.mplot3d import Axes3D

if branches.shape[1] > 2:
    fig = pl.figure()
    ax = fig.add_subplot(111, projection='3d')

    ax.scatter(branches[:,0],branches[:,1],branches[:,2])
else:
    pl.scatter(branches[:,0],branches[:,1])
pl.show()

In [57]:
kde = KernelDensity(bandwidth=0.5)
kde.fit(branches)
b_samples = kde.sample(10000)

In [66]:
if branches.shape[1] > 2:
    fig = pl.figure()
    ax = fig.add_subplot(111, projection='3d')

    ax.scatter(b_samples[:,0],b_samples[:,1],b_samples[:,2])
else:
    pl.scatter(b_samples[:,0],b_samples[:,1])
pl.show()

In [69]:
results = SPADE(b_samples)
pl.show()


Estimating density...
Done! Elapsed time: 0.06s
Down-sampling...
Done! Reduced data from 10000 to 2883 points. Elapsed time: 0.07s
Clustering...
Done! Elapsed time: 8.29s
Up-sampling...
Done! Elapsed time: 9.11s
Computing MST...
Done! Elapsed time: 9.12s
Plotting SPADE tree...
Done! Total elapsed time: 9.26s

In [116]:
distance(h_corners[0],h_corners[-1]),distance(h_corners[0],h_corners[1])


Out[116]:
(22.360679774997898, 10.0)

In [39]:
sizes = [1,2,3,4,5]
dim = [2,3,4]
for (size,d) in itertools.product(sizes,dim):
    h_corners = np.array(hypercube(d))*(size/2.0)
    h_corners.shape
    kde = KernelDensity()
    kde.fit(h_corners)
    h_samples = kde.sample(10000,random_state=0)
    fig = pl.figure()
    for i in range(4):
        pl.subplot(2,2,i+1)
        results = SPADE(h_samples,verbose=False)
    fig.set_size_inches(10,10)
    pl.savefig('{0}D, L={1} hypercube.pdf'.format(d,size))


Done! Total elapsed time: 0.92s
Done! Total elapsed time: 0.96s
Done! Total elapsed time: 0.98s
Done! Total elapsed time: 0.99s
Done! Total elapsed time: 1.10s
Done! Total elapsed time: 1.11s
Done! Total elapsed time: 1.09s
Done! Total elapsed time: 1.15s
Done! Total elapsed time: 2.19s
Done! Total elapsed time: 2.24s
Done! Total elapsed time: 2.21s
Done! Total elapsed time: 2.28s
Done! Total elapsed time: 0.91s
Done! Total elapsed time: 0.90s
Done! Total elapsed time: 0.96s
Done! Total elapsed time: 0.96s
Done! Total elapsed time: 1.18s
Done! Total elapsed time: 1.16s
Done! Total elapsed time: 1.15s
Done! Total elapsed time: 1.17s
Done! Total elapsed time: 1.98s
Done! Total elapsed time: 2.03s
Done! Total elapsed time: 2.09s
Done! Total elapsed time: 1.99s
Done! Total elapsed time: 0.92s
Done! Total elapsed time: 1.01s
Done! Total elapsed time: 0.98s
Done! Total elapsed time: 1.00s
Done! Total elapsed time: 1.23s
Done! Total elapsed time: 1.25s
Done! Total elapsed time: 1.21s
Done! Total elapsed time: 1.28s
Done! Total elapsed time: 2.74s
Done! Total elapsed time: 2.47s
Done! Total elapsed time: 2.63s
Done! Total elapsed time: 2.59s
Done! Total elapsed time: 0.90s
Done! Total elapsed time: 0.95s
Done! Total elapsed time: 0.91s
Done! Total elapsed time: 0.91s
Done! Total elapsed time: 1.41s
Done! Total elapsed time: 2.09s
Done! Total elapsed time: 1.35s
Done! Total elapsed time: 1.39s
Done! Total elapsed time: 4.30s
Done! Total elapsed time: 4.08s
Done! Total elapsed time: 4.04s
Done! Total elapsed time: 4.11s
Done! Total elapsed time: 0.97s
Done! Total elapsed time: 0.99s
Done! Total elapsed time: 0.93s
Done! Total elapsed time: 0.98s
Done! Total elapsed time: 1.51s
Done! Total elapsed time: 1.61s
Done! Total elapsed time: 1.48s
Done! Total elapsed time: 1.56s
Done! Total elapsed time: 5.68s
Done! Total elapsed time: 5.36s
Done! Total elapsed time: 5.14s
Done! Total elapsed time: 5.16s

In [24]:
G = distance_graph(h_corners)
mst = nx.minimum_spanning_tree(G)

In [25]:
pos = nx.graphviz_layout(mst)
positions = np.zeros((len(pos),2))
for p in pos:
    positions[p] = pos[p]


Out[25]:
{0: (-37.927, -29.1),
 1: (52.738, -7.4849),
 2: (-136.23, -12.654),
 3: (98.196, 82.279),
 4: (-51.365, -116.95),
 5: (135.73, -59.545),
 6: (-222.87, -32.112),
 7: (170.7, 138.45),
 8: (-56.021, 57.946),
 9: (69.615, -84.481),
 10: (-179.7, 63.573),
 11: (71.09, 161.21),
 12: (-42.749, -205.81),
 13: (189.83, -129.63),
 14: (-300.16, -5.1818),
 15: (200.96, 214.82),
 16: (-65.881, 6.2453),
 17: (99.437, 1.8192),
 18: (-166.19, -79.43),
 19: (170.59, 70.193),
 20: (-106.3, -173.21),
 21: (216.03, -52.463),
 22: (-271.6, -102.4),
 23: (256.68, 132.02),
 24: (-88.421, 137.54),
 25: (100.79, -165.81),
 26: (-229.25, 131.46),
 27: (59.293, 243.89),
 28: (-57.89, -286.98),
 29: (251.77, -181.27),
 30: (-376.51, 5.009),
 31: (245.62, 278.07)}

In [50]:
results = SPADE(samples)


Estimating density...
Done! Elapsed time: 0.31s
Down-sampling...
Done! Reduced data from 50000 to 1766 points. Elapsed time: 0.33s
Clustering...
Done! Elapsed time: 2.18s
Up-sampling...
Done! Elapsed time: 6.68s
Computing MST...
Done! Elapsed time: 6.69s
Plotting SPADE tree...
Plotting overlay on data
Done! Total elapsed time: 6.94s

In [ ]:
results = SPADE(samples,density_estimator='eps-neighbors')


Estimating density...
Done! Elapsed time: 5.97s

In [92]:
# for reproducibility
npr.seed(2)

In [93]:
# generate some synthetic data
num_centers = 10
dim = 2
centers = np.zeros((num_centers,dim))
sizes = (npr.rand(num_centers)+0.5)

num_points = 10000

for i in xrange(1,num_centers):
    centers[i] = centers[i-1] + npr.randn(dim)*3

    
X = np.zeros((num_points,dim))
Y = np.zeros(num_points)
for i in range(num_points):
    c = npr.randint(num_centers)
    X[i] = npr.randn(dim)*sizes[c] + centers[c]
    Y[i] = c

In [94]:
pl.scatter(X[:,0],X[:,1],c=Y,linewidths=0,alpha=0.5)
pl.scatter(centers[:,0],centers[:,1],c=range(num_centers))
for i in range(num_centers-1):
    cpts = centers[i:i+2]
    #pl.plot(cpts[:,0],cpts[:,1],c='black',linewidth=2)
pl.axis('off')


Out[94]:
(-8.0, 8.0, -6.0, 10.0)

In [6]:
# to do: replace random walk with a branching process
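
One possible replacement, sketched here as an unexecuted cell: centers generated by a simple branching process in which every center at the current level spawns a few children offset by Gaussian noise. The branching factor, step size, and seed below are arbitrary choices, not taken from the paper; the resulting centers could be fed to the same sampling loop used above.

In [ ]:
def branching_centers(depth=4, branch=2, step=3.0, dim=2, seed=0):
    # simple branching process: each center at the current level spawns
    # `branch` children, offset by Gaussian noise of scale `step`
    rng = npr.RandomState(seed)
    levels = [np.zeros((1, dim))]
    for _ in range(depth):
        parents = levels[-1]
        children = (np.repeat(parents, branch, axis=0)
                    + rng.randn(len(parents) * branch, dim) * step)
        levels.append(children)
    return np.vstack(levels)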

In [96]:
from sklearn.neighbors import KernelDensity

In [97]:
bandwidth=0.5
kde = KernelDensity(bandwidth)

In [98]:
kde.fit(X)
scores = kde.score_samples(X)

In [99]:
scores.min(),scores.max()


Out[99]:
(-8.6331214511766312, -3.3486856194786867)

In [100]:
sigmoid = lambda x : (1 / (1 + np.exp(-x)))

In [101]:
accept_prob = 1 - (scores - scores.min()) / (scores.max() - scores.min())
accept_prob_s = sigmoid(accept_prob)
print(accept_prob.min(),accept_prob.max())
print(accept_prob_s.min(),accept_prob_s.max())


(0.0, 1.0)
(0.5, 0.7310585786300049)

In [102]:
pl.scatter(X[:,0],X[:,1],c=(np.log(accept_prob+1)+0.3),linewidths=0,alpha=0.5)
#pl.colorbar();
pl.scatter(centers[:,0],centers[:,1]);
pl.axis('off')


Out[102]:
(-8.0, 8.0, -6.0, 10.0)

In [103]:
down_sampled_X = []
down_sampled_Y = []
down_sampled_i = []
for i in range(num_points):
    if npr.rand() < accept_prob[i]:
        down_sampled_i.append(i)
        down_sampled_X.append(X[i])
        down_sampled_Y.append(Y[i])
down_sampled_X = np.array(down_sampled_X)
down_sampled_Y = np.array(down_sampled_Y)

In [104]:
len(X)


Out[104]:
10000

In [105]:
len(down_sampled_X)


Out[105]:
1718

In [106]:
kde_2 = KernelDensity(bandwidth)
kde_2.fit(down_sampled_X)
new_densities = kde_2.score_samples(down_sampled_X)

In [107]:
new_densities.shape


Out[107]:
(1718,)

In [108]:
pl.scatter(down_sampled_X[:,0],down_sampled_X[:,1],c=-new_densities.max() - new_densities,linewidths=0,alpha=0.5)
#pl.colorbar();
pl.scatter(centers[:,0],centers[:,1]);
pl.axis('off')


Out[108]:
(-8.0, 8.0, -6.0, 10.0)

In [109]:
from sklearn.cluster import AgglomerativeClustering

In [110]:
cluster_model = AgglomerativeClustering(10)
cluster_model.fit(down_sampled_X)


Out[110]:
AgglomerativeClustering(affinity='euclidean', compute_full_tree='auto',
            connectivity=None, linkage='ward',
            memory=Memory(cachedir=None), n_clusters=10, n_components=None,
            pooling_func=<function mean at 0x106c842a8>)

In [111]:
cluster_pred = cluster_model.labels_

In [112]:
pl.scatter(down_sampled_X[:,0],down_sampled_X[:,1],c=Y[down_sampled_i],linewidths=0,alpha=0.5)
#pl.colorbar();
pl.scatter(centers[:,0],centers[:,1]);
pl.axis('off')


Out[112]:
(-8.0, 8.0, -6.0, 10.0)

In [113]:
cluster_centers = []
for i in range(len(set(cluster_pred))):
    cluster_centers.append(np.mean(down_sampled_X[cluster_pred==i],0))
cluster_centers = np.array(cluster_centers)

In [114]:
pl.scatter(down_sampled_X[:,0],down_sampled_X[:,1],c=cluster_pred,linewidths=0,alpha=0.5)
pl.scatter(cluster_centers[:,0],cluster_centers[:,1]);
pl.axis('off')


Out[114]:
(-8.0, 8.0, -6.0, 10.0)

In [26]:
cluster_true = Y[down_sampled_i]

In [27]:
from sklearn import metrics

In [28]:
# for just the downsampled points
metrics.adjusted_mutual_info_score(cluster_true,cluster_pred)


Out[28]:
0.6368748721948998

In [121]:
def assign_to_nearest_landmark(points,landmarks):
    nearest = np.zeros(len(points))
    for i,point in enumerate(points):
        distances = np.zeros(len(landmarks))
        for j,landmark in enumerate(landmarks):
            distances[j] = distance(point,landmark)
        nearest[i] = np.argmin(distances)
    return nearest

In [122]:
upsampled_cluster_pred = assign_to_nearest_landmark(X,cluster_centers)

In [123]:
knn = neighbors.KNeighborsClassifier(1,metric='l1')
knn.fit(down_sampled_X,cluster_pred)


Out[123]:
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='l1',
           metric_params=None, n_neighbors=1, p=2, weights='uniform')

In [124]:
upsampled_cluster_pred = knn.predict(X)

In [125]:
pl.scatter(X[:,0],X[:,1],c=upsampled_cluster_pred,linewidths=0,alpha=0.5)
#pl.colorbar();
pl.scatter(cluster_centers[:,0],cluster_centers[:,1]);
pl.axis('off')


Out[125]:
(-8.0, 8.0, -6.0, 10.0)

In [126]:
# for all points
from sklearn import metrics   # make sure metrics is available in this session
metrics.adjusted_mutual_info_score(Y,upsampled_cluster_pred)

In [127]:
from sklearn.cluster import KMeans

In [128]:
kmeans_model = KMeans(10)

In [129]:
kmeans_model.fit(X)


Out[129]:
KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=10, n_init=10,
    n_jobs=1, precompute_distances=True, random_state=None, tol=0.0001,
    verbose=0)

In [130]:
kmeans_pred = kmeans_model.predict(X)
metrics.adjusted_mutual_info_score(Y,kmeans_pred)

In [132]:
G = distance_graph(centers)
mst = nx.minimum_spanning_tree(G)

In [133]:
mst.edges()


Out[133]:
[(0, 7), (1, 5), (2, 3), (2, 5), (2, 7), (4, 5), (5, 6), (7, 8), (8, 9)]

In [134]:
pl.scatter(down_sampled_X[:,0],down_sampled_X[:,1],c=Y[down_sampled_i],linewidths=0,alpha=0.5)
pl.scatter(centers[:,0],centers[:,1]);
for e in mst.edges():
    cpts = centers[np.array(e)]
    pl.plot(cpts[:,0],cpts[:,1],c='black',linewidth=2)
pl.axis('off')


Out[134]:
(-8.0, 8.0, -6.0, 10.0)

In [135]:
G = distance_graph(cluster_centers)
mst = nx.minimum_spanning_tree(G)

pl.scatter(down_sampled_X[:,0],down_sampled_X[:,1],c=cluster_pred,linewidths=0,alpha=0.5)
pl.scatter(cluster_centers[:,0],cluster_centers[:,1]);
for e in mst.edges():
    cpts = cluster_centers[np.array(e)]
    pl.plot(cpts[:,0],cpts[:,1],c='black',linewidth=2)
pl.axis('off')


Out[135]:
(-8.0, 8.0, -6.0, 10.0)

In [136]:
occupancy = np.array([sum(upsampled_cluster_pred==i) for i in range(len(set(upsampled_cluster_pred)))])
norm_occupancy = (occupancy - occupancy.min()) / float(occupancy.max() - occupancy.min())

In [137]:
norm_occupancy.max()


Out[137]:
1

In [143]:
G = distance_graph(cluster_centers)
mst = nx.minimum_spanning_tree(G)
for e in mst.edges():
    cpts = cluster_centers[np.array(e)]
    pl.plot(cpts[:,0],cpts[:,1],c='black',linewidth=2)
pl.scatter(down_sampled_X[:,0],down_sampled_X[:,1],c=cluster_pred,linewidths=0,alpha=0.5)
pl.scatter(cluster_centers[:,0],cluster_centers[:,1],
           c=cluster_centers[:,0],
           s=100+(200*norm_occupancy));

pl.axis('off')


Out[143]:
(-8.0, 8.0, -6.0, 10.0)

In [46]:
occupancy = np.array([sum(Y==i) for i in range(len(set(upsampled_cluster_pred)))])
norm_occupancy = (occupancy - occupancy.min()) / float(occupancy.max() - occupancy.min())

In [47]:
G = distance_graph(centers)
mst = nx.minimum_spanning_tree(G)
for e in mst.edges():
    cpts = centers[np.array(e)]
    pl.plot(cpts[:,0],cpts[:,1],c='black',linewidth=2)
#pl.scatter(down_sampled_X[:,0],down_sampled_X[:,1],c=cluster_pred,linewidths=0,alpha=0.5)
pl.scatter(centers[:,0],centers[:,1],
           c=centers[:,1],s=100+(200*norm_occupancy));
pl.axis('off')


Out[47]:
(-5.0, 4.0, -4.0, 8.0)

Re-running the pipeline with 50 clusters (the SPADE default) instead of 10:


In [86]:
results = SPADE(X[:1000])


Estimating density...
Done! Elapsed time: 0.05s
Down-sampling...
Done! Reduced data from 1000 to 251 points. Elapsed time: 0.05s
Clustering...
Done! Elapsed time: 0.06s
Up-sampling...
Done! Elapsed time: 0.43s
Computing MST...
Done! Elapsed time: 0.44s
Plotting SPADE tree...
Plotting overlay on data
Done! Total elapsed time: 0.63s

In [227]:
pl.scatter(samples[:,0],samples[:,1],linewidths=0,
           c=kde.score_samples(samples))
pl.axis('off')


Out[227]:
(-30.0, 30.0, -30.0, 30.0)

In [228]:
for i in range(10):
    pl.figure()
    results = SPADE(samples)


Estimating density...
Done! Elapsed time: 0.70s
Down-sampling...
Done! Reduced data from 5000 to 1019 points. Elapsed time: 0.70s
Clustering...
Done! Elapsed time: 1.04s
Up-sampling...
Done! Elapsed time: 2.74s
Computing MST...
Done! Elapsed time: 2.75s
Plotting SPADE tree...
Plotting overlay on data
Done! Total elapsed time: 2.94s
Estimating density...
Done! Elapsed time: 0.67s
Down-sampling...
Done! Reduced data from 5000 to 1087 points. Elapsed time: 0.67s
Clustering...
Done! Elapsed time: 1.08s
Up-sampling...
Done! Elapsed time: 2.82s
Computing MST...
Done! Elapsed time: 2.83s
Plotting SPADE tree...
Plotting overlay on data
Done! Total elapsed time: 3.02s
Estimating density...
Done! Elapsed time: 0.69s
Down-sampling...
Done! Reduced data from 5000 to 1033 points. Elapsed time: 0.69s
Clustering...
Done! Elapsed time: 1.03s
Up-sampling...
Done! Elapsed time: 2.79s
Computing MST...
Done! Elapsed time: 2.80s
Plotting SPADE tree...
Plotting overlay on data
Done! Total elapsed time: 3.22s
Estimating density...
Done! Elapsed time: 0.70s
Down-sampling...
Done! Reduced data from 5000 to 1064 points. Elapsed time: 0.70s
Clustering...
Done! Elapsed time: 1.09s
Up-sampling...
Done! Elapsed time: 2.87s
Computing MST...
Done! Elapsed time: 2.89s
Plotting SPADE tree...
Plotting overlay on data
Done! Total elapsed time: 3.08s
Estimating density...
Done! Elapsed time: 0.70s
Down-sampling...
Done! Reduced data from 5000 to 1087 points. Elapsed time: 0.70s
Clustering...
Done! Elapsed time: 1.12s
Up-sampling...
Done! Elapsed time: 2.93s
Computing MST...
Done! Elapsed time: 2.95s
Plotting SPADE tree...
Plotting overlay on data
Done! Total elapsed time: 3.14s
Estimating density...
Done! Elapsed time: 0.69s
Down-sampling...
Done! Reduced data from 5000 to 1046 points. Elapsed time: 0.70s
Clustering...
Done! Elapsed time: 1.07s
Up-sampling...
Done! Elapsed time: 2.85s
Computing MST...
Done! Elapsed time: 2.86s
Plotting SPADE tree...
Plotting overlay on data
Done! Total elapsed time: 3.04s
Estimating density...
Done! Elapsed time: 0.66s
Down-sampling...
Done! Reduced data from 5000 to 1025 points. Elapsed time: 0.67s
Clustering...
Done! Elapsed time: 1.01s
Up-sampling...
Done! Elapsed time: 2.79s
Computing MST...
Done! Elapsed time: 2.80s
Plotting SPADE tree...
Plotting overlay on data
Done! Total elapsed time: 2.99s
Estimating density...
Done! Elapsed time: 0.70s
Down-sampling...
Done! Reduced data from 5000 to 1045 points. Elapsed time: 0.70s
Clustering...
Done! Elapsed time: 1.05s
Up-sampling...
Done! Elapsed time: 2.75s
Computing MST...
Done! Elapsed time: 2.77s
Plotting SPADE tree...
Plotting overlay on data
Done! Total elapsed time: 2.95s
Estimating density...
Done! Elapsed time: 0.70s
Down-sampling...
Done! Reduced data from 5000 to 1027 points. Elapsed time: 0.70s
Clustering...
Done! Elapsed time: 1.05s
Up-sampling...
Done! Elapsed time: 2.85s
Computing MST...
Done! Elapsed time: 2.86s
Plotting SPADE tree...
Plotting overlay on data
Done! Total elapsed time: 3.16s
Estimating density...
Done! Elapsed time: 0.70s
Down-sampling...
Done! Reduced data from 5000 to 1051 points. Elapsed time: 0.70s
Clustering...
Done! Elapsed time: 1.08s
Up-sampling...
Done! Elapsed time: 2.82s
Computing MST...
Done! Elapsed time: 2.83s
Plotting SPADE tree...
Plotting overlay on data
Done! Total elapsed time: 3.02s

In [72]:
cluster_pred = cluster_model.labels_

In [73]:
cluster_centers = []
for i in range(len(set(cluster_pred))):
    cluster_centers.append(np.mean(down_sampled_X[cluster_pred==i],0))
cluster_centers = np.array(cluster_centers)

In [74]:
upsampled_cluster_pred = assign_to_nearest_landmark(X,cluster_centers)

In [75]:
pl.scatter(down_sampled_X[:,0],down_sampled_X[:,1],c=cluster_pred,linewidths=0,alpha=0.5)
pl.scatter(cluster_centers[:,0],cluster_centers[:,1]);
pl.axis('off')


Out[75]:
(-8.0, 8.0, -6.0, 10.0)

In [76]:
pl.scatter(X[:,0],X[:,1],c=upsampled_cluster_pred,linewidths=0,alpha=0.5)
pl.scatter(cluster_centers[:,0],cluster_centers[:,1]);
pl.axis('off')


Out[76]:
(-8.0, 8.0, -6.0, 10.0)

In [77]:
G = distance_graph(cluster_centers)
mst = nx.minimum_spanning_tree(G)

pl.scatter(down_sampled_X[:,0],down_sampled_X[:,1],c=cluster_pred,linewidths=0,alpha=0.5)
pl.scatter(cluster_centers[:,0],cluster_centers[:,1]);
for e in mst.edges():
    cpts = cluster_centers[np.array(e)]
    pl.plot(cpts[:,0],cpts[:,1],c='black',linewidth=2)
pl.axis('off')


Out[77]:
(-8.0, 8.0, -6.0, 10.0)

In [78]:
occupancy = np.array([sum(Y==i) for i in range(len(set(upsampled_cluster_pred)))])
norm_occupancy = (occupancy - occupancy.min()) / float(occupancy.max() - occupancy.min())

In [79]:
G = distance_graph(cluster_centers)
mst = nx.minimum_spanning_tree(G)
for e in mst.edges():
    cpts = cluster_centers[np.array(e)]
    pl.plot(cpts[:,0],cpts[:,1],c='black',linewidth=2)
#pl.scatter(down_sampled_X[:,0],down_sampled_X[:,1],c=cluster_pred,linewidths=0,alpha=0.5)
pl.scatter(cluster_centers[:,0],cluster_centers[:,1],
           c=cluster_centers[:,1],s=100+(200*norm_occupancy));
pl.axis('off')


Out[79]:
(-6.0, 8.0, -6.0, 10.0)

In [135]:
#pos = nx.spectral_layout(mst)
pos = nx.graphviz_layout(mst)

In [136]:
positions = np.zeros((len(pos),2))
for p in pos:
    positions[p] = pos[p]

In [140]:
G = distance_graph(cluster_centers)
mst = nx.minimum_spanning_tree(G)
for e in mst.edges():
    cpts = positions[np.array(e)]
    pl.plot(cpts[:,0],cpts[:,1],c='black',linewidth=2)
#pl.scatter(down_sampled_X[:,0],down_sampled_X[:,1],c=cluster_pred,linewidths=0,alpha=0.5)
pl.scatter(positions[:,0],positions[:,1],
           c=cluster_centers[:,0],
           s=100+(200*norm_occupancy));
pl.axis('off')


Out[140]:
(-1000.0, 1500.0, -600.0, 600.0)

In [104]:
pl.scatter(positions[:,0],positions[:,1])


Out[104]:
<matplotlib.collections.PathCollection at 0x1084e6150>

In [ ]: